# LOAD
import os
import re
import sqlite3
import datetime
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import cross_validation 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier

# Absolute path to the replication SQLite database. The location is specific
# to one machine and has to be edited before running anywhere else.
database = "C:\\Users\\Daniel\\Dropbox\\Journal (II)\\Replication Files\\Arel-Bundock, Vincent Replication\\replication\\replication_20160118.sqlite"
# Module-level connection, opened at import time; get_data() reads all of its
# tables through it. It is not closed anywhere in the visible code.
conn = sqlite3.connect(database)

def get_data(sample, predictors, variables, connection=None):
    """Assemble the modelling data set for one sample table.

    Reads the ``wdi``, ``guo``, ``distance``, ``<sample>`` and
    ``predictors_<predictors>`` tables, joins them, and keeps only the
    columns whose name matches one of the requested patterns.

    Parameters
    ----------
    sample : str
        Name of the sample table to read.
    predictors : str
        Suffix of the predictor table; ``predictors_<predictors>`` is read.
    variables : list of str
        Regular-expression patterns. A column is kept when ``re.search``
        finds any of them in its name. Every column of the predictor table
        except its first one is appended to this list as a further pattern.
    connection : DB-API connection, optional
        Connection to read from. Defaults to the module-level ``conn``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, sorted by ``iso2c_guo``, ``id_guo`` and
        ``iso2c_sub``, with those three key columns placed first.
    """
    if connection is None:
        # Keep the historical behaviour: read through the shared connection.
        connection = conn

    wdi = pd.read_sql('SELECT * FROM wdi', connection)
    guo = pd.read_sql('SELECT * FROM guo', connection)
    # Table names cannot be bound as SQL parameters, so they are spliced into
    # the statement; only pass trusted, hard-coded names.
    sample_rows = pd.read_sql('SELECT * FROM ' + sample, connection)
    distance = pd.read_sql('SELECT * FROM distance', connection)
    political = pd.read_sql('SELECT * FROM predictors_' + predictors,
                            connection)

    # The first two joins are left joins on whatever columns the frames
    # share; the last two are inner joins on the subsidiary country code.
    dat = pd.merge(sample_rows, guo, how='left')
    dat = pd.merge(dat, distance, how='left')
    dat = pd.merge(dat, wdi, left_on='iso2c_sub', right_on='iso2c')
    dat = pd.merge(dat, political, left_on='iso2c_sub', right_on='iso2c')

    # Skip the predictor table's first column (the join key in the merge
    # above) and treat the remaining column names as extra patterns.
    patterns = list(variables) + political.columns.tolist()[1:]
    wanted = re.compile('|'.join(patterns))
    cols = [col for col in dat.columns if wanted.search(col) is not None]
    dat = dat[cols]

    # Round-trip through the index to sort by the keys and move them to the
    # front of the frame.
    keys = ['iso2c_guo', 'id_guo', 'iso2c_sub']
    return dat.set_index(keys).sort_index().reset_index()

def get_arrays(dat):
    """Split one data set into train/test arrays with and without politics.

    Rows containing any missing value are dropped first. The outcome is the
    ``sub_dummy_host`` column and the features are every column from
    position 4 onward (this assumes the first four columns are identifiers
    plus the outcome, as laid out by ``get_data`` -- confirm if the column
    order changes). The "apolitical" feature set additionally drops every
    column whose name matches a variable flagged ``political == 1`` in
    ``codebook.csv`` (read from the current working directory).

    Parameters
    ----------
    dat : pandas.DataFrame
        Data set containing a ``sub_dummy_host`` column.

    Returns
    -------
    dict
        ``y_train``, ``y_test``, ``X_pol_train``, ``X_pol_test``,
        ``X_apol_train``, ``X_apol_test`` (arrays from a 50% training split
        with ``random_state=1024``) plus ``variables_pol`` and
        ``variables_apol``, the column names behind each feature matrix.
    """
    dat = dat.dropna()
    # variables
    codebook = pd.read_csv('codebook.csv')
    political_names = codebook.variable[codebook.political == 1].tolist()
    # NOTE(review): the names are used as unanchored regex patterns, and an
    # empty list yields a pattern that matches every column (X_apol would
    # then have no columns).
    political = re.compile('|'.join(political_names))
    # arrays
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    y = dat.sub_dummy_host.values
    X = dat.iloc[:, 4:]
    X_apol = X[[col for col in X.columns if political.search(col) is None]]
    variables_pol = X.columns.tolist()
    variables_apol = X_apol.columns.tolist()
    # NOTE(review): `sklearn.cross_validation` was removed in scikit-learn
    # 0.20 in favour of `sklearn.model_selection`; switching requires
    # changing the module-level import as well.
    arrays = cross_validation.train_test_split(
        y, X.values, X_apol.values, train_size=.5, random_state=1024)
    labels = ['y_train', 'y_test', 'X_pol_train',
              'X_pol_test', 'X_apol_train', 'X_apol_test']
    out = dict(zip(labels, arrays))
    out['variables_pol'] = variables_pol
    out['variables_apol'] = variables_apol
    return out

def estimate(arrays, cls):
    """Score one classifier on the apolitical and the full feature sets.

    The classifier's ``random_state`` attribute is set to 1024, then it is
    fitted and scored on the apolitical features and afterwards refitted and
    scored on the political features. The passed-in object is modified in
    place and ends up holding the second fit.

    Parameters
    ----------
    arrays : dict
        Must provide ``y_train``, ``y_test`` and the ``X_apol_*`` /
        ``X_pol_*`` train and test entries.
    cls : object
        Anything exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``score_apol`` and ``score_pol``.
    """
    cls.random_state = 1024
    # (result column, training features, test features), in evaluation order.
    runs = (
        ('score_apol', 'X_apol_train', 'X_apol_test'),
        ('score_pol', 'X_pol_train', 'X_pol_test'),
    )
    names = []
    scores = []
    for name, train_key, test_key in runs:
        cls.fit(arrays[train_key], arrays['y_train'])
        scores.append(cls.score(arrays[test_key], arrays['y_test']))
        names.append(name)
    # Build a column of scores labelled by name, then flip it into one row.
    return pd.DataFrame(scores, index=names).T

class Logistic:
    """Adapter giving a statsmodels logit model a ``fit``/``score`` interface.

    Exposes the two methods that ``estimate`` calls on a classifier.
    """

    def fit(self, X, y):
        """Fit ``sm.Logit(y, X)`` and keep the fitted results on ``self.cls``.

        X is passed to the model as given; no constant column is added here.
        """
        model = sm.Logit(y, X)
        self.cls = model.fit()

    def score(self, X, y):
        """Return the share of rows in X whose predicted class equals y.

        A row is classified as 1 when the fitted model's prediction is
        strictly greater than 0.5, otherwise as 0.
        """
        predicted = self.cls.predict(X)
        labels = np.where(predicted > .5, 1, 0)
        hits = labels == y
        return np.mean(hits)

# Models to benchmark, keyed by the label written to the 'classifier' column
# of the results table.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=50,
                                            random_state=1024),
}

# Sample tables available in the database.
tables = ['sample', 'sample_klarge', 'sample_hi', 'sample_lo', 'sample_havens',
          'sample_outliers', 'sample_strata']
# Column-selection patterns. get_data() joins them with '|' and applies
# re.search to every column name, so each entry is a regular expression
# (e.g. '.' matches any single character and '^gdp$' only the exact name).
# The digit pattern is a raw string so that '\d' is not an invalid string
# escape.
variables = ['id_guo', 'iso2c.guo', 'iso2c.sub', 'iso_count', 'sub_dummy_host',
             r'naics\d', 'distw', '^gdp$']
# Variants of the baseline pattern list with extra or fewer covariates.
variables_distance = variables + ['economic_distance', 'administrative_distance']
variables_wdi = variables + ['gdppc', 'pop', 'pop_urban', 'telephone']
variables_nodistance = [x for x in variables if x != 'distw']

# Build one data set per sample definition; 'klarge' uses its own predictor
# table, the others share 'ksmall'.
print('prepare data')
data = {'baseline': get_data('sample', 'ksmall', variables),
        'hi': get_data('sample_hi', 'ksmall', variables),
        'lo': get_data('sample_lo', 'ksmall', variables),
        'klarge': get_data('sample_klarge', 'klarge', variables)}
print('prepare arrays')
arrays = {k: get_arrays(frame) for k, frame in data.items()}

# Score every classifier on every sample and stack the one-row results.
print('estimate models')
results = []
for a, sample_arrays in arrays.items():
    for c, classifier in classifiers.items():
        print(a, c)
        tmp = estimate(sample_arrays, classifier)
        tmp['sample'] = a
        tmp['classifier'] = c
        results.append(tmp)
results = pd.concat(results)
# Create the output directory if needed so that a missing folder does not
# discard the fitted results at the very last step.
if not os.path.isdir('tables'):
    os.makedirs('tables')
results.to_csv('tables/table_6.csv', index=False)
